This project will predict whether an image contains a ship or an iceberg with machine learning.
# Data Manipulation
import pandas as pd
pd.options.display.max_columns = 25
import numpy as np
from IPython.display import display
# Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid", {"axes.grid": False})
import missingno as msno
# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss
#from xgboost import XGBClassifier
# test, train and submission files
# NOTE(review): pd.read_json already returns a DataFrame, so the extra
# pd.DataFrame(...) wraps below are redundant — they re-wrap the same data.
test = pd.read_json('data/test.json')
test_df = pd.DataFrame(test)
train = pd.read_json('data/train.json')
train_df = pd.DataFrame(train)
submission_ex = pd.read_csv('data/sample_submission.csv')
Inspect the training data: shape, column types and null counts, summary statistics, and the first few rows.
# Quick look at the training data: dimensions, dtypes/null counts,
# numeric summary, and the first rows.
train_df.shape
train_df.info()
train_df.describe()
train_df.head()
Looks like pandas read the inc_angle as text. We'll have to convert that to a float.
# Replace the 'na' placeholder strings with real NaN so inc_angle can be numeric.
# Use np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
train_df.replace('na', np.nan, inplace=True)
train_df['inc_angle'] = pd.to_numeric(train_df['inc_angle'])
train_df.info()
# Visualize missingness; only inc_angle should show gaps.
msno.matrix(train_df)
Plot the class distribution, the incidence-angle distribution, and some sample band images.
# Class balance: how many ships vs. icebergs are in the training set?
class_counts = train_df.is_iceberg.value_counts()
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values)
plt.xticks(range(2), ['Ships', 'Icebergs'])
plt.title('Class Distribution')
plt.show()
# Distribution of the incidence angles (missing values dropped).
# sns.distplot was deprecated in seaborn 0.11 and later removed — use
# histplot with a KDE overlay instead. The unused value_counts() call
# from the original cell is dropped.
plt.figure(figsize=(8, 6))
sns.histplot(train_df.inc_angle.dropna(), kde=True)
plt.show()
# Show the first sample's HH (band_1) image.
hh_sample = np.array(train_df.loc[0, 'band_1']).reshape(75, 75)
plt.imshow(hh_sample)
plt.show()
# Show the first sample's HV (band_2) image.
hv_sample = np.array(train_df.loc[0, 'band_2']).reshape(75, 75)
plt.imshow(hv_sample)
plt.show()
# Show the per-pixel difference (HH minus HV) for the first sample.
hh_pixels = np.array(train_df.loc[0, 'band_1'])
hv_pixels = np.array(train_df.loc[0, 'band_2'])
difference_image = (hh_pixels - hv_pixels).reshape(75, 75)
plt.imshow(difference_image)
plt.show()
In the example pics, the contrast between the hh and hv iceberg images is greater than the contrast between the hh and hv ship images. Icebergs seem to "disappear" under hv images, while ships are clearly visible in both hv and hh images. That difference in contrast may be the key.
def plot_image_grid(train, band_type, n_row, n_col):
    """Plot an n_row x n_col grid of 75x75 satellite images from `train`.

    Parameters
    ----------
    train : pandas.DataFrame
        Must have 'band_1', 'band_2', 'is_iceberg' and 'inc_angle' columns
        and an integer index covering 0..n_row*n_col-1 (use
        reset_index(drop=True) after filtering).
    band_type : str
        Figure title; if it contains 'HV', band_2 is plotted, else band_1.
    n_row, n_col : int
        Grid dimensions.
    """
    band = 'band_2' if 'HV' in band_type else 'band_1'
    plt.figure(figsize=(12, 12))
    for i in range(n_row * n_col):
        sat_image = np.array(train.loc[i, band])
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(sat_image.reshape(75, 75))
        classification = 'Iceberg' if train.loc[i, 'is_iceberg'] else 'Ship'
        # Raw string so '\c' in the mathtext degree symbol is not treated
        # as a (deprecated) string escape.
        title_text = "{0} {1:.0f} {2}".format(
            classification, train.loc[i, 'inc_angle'], r'$^\circ$')
        plt.title(title_text, size=6.5)
        plt.xticks(())
        plt.yticks(())
    plt.suptitle(band_type)
    plt.show()
Plot ships and icebergs separately. Show ship hh and hv images, then iceberg hh and hv images to get a better feel for their differences.
# Four grids: ships then icebergs, each in HH and HV polarization.
for iceberg_flag, grid_title in (
    (0, 'HH Ships'),
    (0, 'HV Ships'),
    (1, 'HH Icebergs'),
    (1, 'HV Icebergs'),
):
    subset = train_df[train_df.is_iceberg == iceberg_flag].reset_index(drop=True)
    plot_image_grid(subset, grid_title, 7, 7)
Need a way to feed the data from the two images into the classifier.
First thought: Subtract the two arrays and use the incidence angle as just another column.
Other ideas: average the two bands, or stack both bands (plus their average) as channels of a 3-D array — both are explored below.
# Turn each band's pixel list into a numpy array, row by row.
band_1_arr = train_df.band_1.apply(np.asarray)
band_2_arr = train_df.band_2.apply(np.asarray)
# Candidate features: per-pixel difference and per-pixel average.
band_diff = band_1_arr - band_2_arr
band_ave = (band_1_arr + band_2_arr) / 2.0
# Expand every per-row array into one DataFrame column per pixel.
band_diff_series = band_diff.apply(pd.Series)
band_ave_series = band_ave.apply(pd.Series)
band_1_df = band_1_arr.apply(pd.Series)
band_2_df = band_2_arr.apply(pd.Series)
# Reuse the grid plotter by swapping the derived images into band_1.
band_ave_df = train_df.copy()
band_ave_df.band_1 = band_ave
plot_image_grid(band_ave_df, 'Average Images', 7, 7)
band_diff_df = train_df.copy()
band_diff_df.band_1 = band_diff
plot_image_grid(band_diff_df, 'Difference Images', 7, 7)
Second thought: Turn the images into 3d arrays
Need to research digit-recognition notebooks: what pushes accuracy from ~94% to 99.9%? Below, the data are split into features (x) and labels (y).
# Model 1: random forest on the raw HH pixels plus incidence angle.
hh_idea = pd.concat((band_1_df, train_df.inc_angle), axis=1)
forest = RandomForestClassifier(random_state=42)
# Keep only rows with a known incidence angle — sklearn can't fit on NaN.
x = hh_idea[hh_idea.inc_angle.notnull()].values
y = train_df.loc[train_df.inc_angle.notnull(), 'is_iceberg'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)
forest.fit(x_train, y_train)
pred = forest.predict(x_test)
pred_prob = forest.predict_proba(x_test)
# print() calls: the original Python 2 print statements are syntax errors
# under Python 3. log_loss's `eps` argument was deprecated and then removed
# in scikit-learn; 1e-15 was the legacy default, so the score is unchanged.
print("Log Loss:", log_loss(y_test, pred_prob))
print("Accuracy Score:", accuracy_score(y_test, pred))
print("Classification Report")
print(classification_report(y_test, pred))
hh_conf_matrix = confusion_matrix(y_test, pred)
# Model 2: logistic regression on the same HH-pixel features.
hh_idea = pd.concat((band_1_df, train_df.inc_angle), axis=1)
logreg = LogisticRegression(random_state=42)
x = hh_idea[hh_idea.inc_angle.notnull()].values
y = train_df.loc[train_df.inc_angle.notnull(), 'is_iceberg'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)
logreg.fit(x_train, y_train)
pred = logreg.predict(x_test)
# Python 3 print() calls (the original Python 2 print statements don't parse).
print("Accuracy Score:", accuracy_score(y_test, pred))
print("Classification Report")
print(classification_report(y_test, pred))
# NOTE(review): this overwrites the forest's hh_conf_matrix from the cell
# above, so the later 'HH Model' heatmap shows the logistic-regression
# results — confirm that is intended.
hh_conf_matrix = confusion_matrix(y_test, pred)
# Model 3: random forest on the raw HV pixels plus incidence angle.
hv_idea = pd.concat((band_2_df, train_df.inc_angle), axis=1)
forest = RandomForestClassifier(random_state=42)
x = hv_idea[hv_idea.inc_angle.notnull()].values
y = train_df.loc[train_df.inc_angle.notnull(), 'is_iceberg'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)
forest.fit(x_train, y_train)
pred = forest.predict(x_test)
# Python 3 print() calls (the original Python 2 print statements don't parse).
print("Classification Report")
print(classification_report(y_test, pred))
hv_conf_matrix = confusion_matrix(y_test, pred)
# Model 4 ("first thought"): random forest on the band-difference pixels
# plus incidence angle.
first_idea = pd.concat((band_diff_series, train_df.inc_angle), axis=1)
forest = RandomForestClassifier(random_state=42)
x = first_idea[first_idea.inc_angle.notnull()].values
y = train_df.loc[train_df.inc_angle.notnull(), 'is_iceberg'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)
forest.fit(x_train, y_train)
pred = forest.predict(x_test)
# Python 3 print() calls (the original Python 2 print statements don't parse).
print("Classification Report")
print(classification_report(y_test, pred))
diff_conf_matrix = confusion_matrix(y_test, pred)
# Model 5: random forest on the band-average pixels plus incidence angle.
average_idea = pd.concat((band_ave_series, train_df.inc_angle), axis=1)
forest = RandomForestClassifier(random_state=42)
x = average_idea[average_idea.inc_angle.notnull()].values
y = train_df.loc[train_df.inc_angle.notnull(), 'is_iceberg'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)
forest.fit(x_train, y_train)
pred = forest.predict(x_test)
pred_prob = forest.predict_proba(x_test)
# Python 3 print() calls; `eps` dropped from log_loss (removed in sklearn 1.5,
# 1e-15 was the legacy default).
print("Log Loss:", log_loss(y_test, pred_prob))
print("Accuracy Score:", accuracy_score(y_test, pred))
print("Classification Report")
print(classification_report(y_test, pred))
ave_conf_matrix = confusion_matrix(y_test, pred)
def reshape_square(arr):
    """Reshape a flat 5625-element array into a 75x75 image."""
    return np.reshape(arr, (75, 75))
# Build (n, 75, 75) image stacks for the rows with no missing values.
train_drop = train_df.dropna()
band_1_list = train_drop.band_1.tolist()
band_1_long_arr = np.array(band_1_list)
# -1 lets numpy infer the row count instead of hard-coding 1471.
band_1_square = band_1_long_arr.reshape(-1, 75, 75)
band_2_list = train_drop.band_2.tolist()
band_2_long_arr = np.array(band_2_list)
band_2_square = band_2_long_arr.reshape(-1, 75, 75)
# Average band, filtered with the same dropna() row mask. Note band_ave
# here is the Series computed earlier, then rebound to the filtered rows.
train_ave_temp = train_df.copy()
train_ave_temp.band_1 = band_ave
band_ave = train_ave_temp.dropna().band_1
band_ave_list = band_ave.apply(list)
band_ave_list = band_ave_list.tolist()
# BUG FIX: the original built this channel from band_1_list /
# band_1_long_arr, so the "average" channel silently duplicated band 1.
band_ave_long_arr = np.array(band_ave_list)
band_ave_square = band_ave_long_arr.reshape(-1, 75, 75)
band_ave_square[:, :, :, np.newaxis].shape
# Stack HH, HV and average as three channels on a new trailing axis.
x_square = np.concatenate(
    (
        band_1_square[:, :, :, np.newaxis],
        band_2_square[:, :, :, np.newaxis],
        band_ave_square[:, :, :, np.newaxis]),
    axis=3
)
x_square.shape
band_1_square[:, :, :, np.newaxis].shape
band_2_square[:, :, :, np.newaxis].shape
band_ave_square[:, :, :, np.newaxis].shape
# Angle feature and labels for the same dropna()-filtered rows, then a
# three-way aligned train/test split.
complete_rows = train_df.dropna()
x_angle = complete_rows.inc_angle.values
y = complete_rows.is_iceberg.values
x_train, x_test, x_angle_train, x_angle_test, y_train, y_test = train_test_split(
    x_square, x_angle, y, test_size=.3, random_state=42
)
# NOTE(review): scratch cell — loads the iris dataset only to inspect an
# array shape; unrelated to the ship/iceberg task. It rebinds `x`, but `x`
# is reassigned again before its next use, so nothing downstream breaks.
from sklearn.datasets import load_iris
iris = load_iris()
x = iris.data
x.shape
# Model 6: random forest on the 3-channel image stack plus incidence angle.
# BUG FIX: the original called forest.fit([x_train, x_angle_train], y_train);
# scikit-learn requires a single 2-D feature matrix, so that raises a
# ValueError. Flatten each image stack and append the angle as one column.
forest = RandomForestClassifier(random_state=42)
train_features = np.hstack(
    (x_train.reshape(x_train.shape[0], -1), x_angle_train[:, np.newaxis])
)
test_features = np.hstack(
    (x_test.reshape(x_test.shape[0], -1), x_angle_test[:, np.newaxis])
)
forest.fit(train_features, y_train)
pred = forest.predict(test_features)
pred_prob = forest.predict_proba(test_features)
# Python 3 print() calls; `eps` dropped from log_loss (removed in sklearn 1.5).
print("Log Loss:", log_loss(y_test, pred_prob))
print("Accuracy Score:", accuracy_score(y_test, pred))
print("Classification Report")
print(classification_report(y_test, pred))
# Renamed from ave_conf_matrix: the original clobbered the average-model
# matrix computed earlier, which the 'Average Model' heatmap then displayed.
stack_conf_matrix = confusion_matrix(y_test, pred)
def plot_conf_matrix(conf_matrix, title):
    """Draw a labeled 2x2 confusion-matrix heatmap (Ship vs. Iceberg)."""
    labels = ['Ship', 'Iceberg']
    conf_df = pd.DataFrame(conf_matrix, index=labels, columns=labels)
    sns.heatmap(
        conf_df,
        square=True,
        annot=True,
        cmap='viridis',
        fmt='0g'
    )
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.suptitle(title)
    plt.show()
# One heatmap per model's confusion matrix.
for matrix, model_title in (
    (hh_conf_matrix, 'HH Model'),
    (hv_conf_matrix, 'HV Model'),
    (diff_conf_matrix, 'Diff Model'),
    (ave_conf_matrix, 'Average Model'),
):
    plot_conf_matrix(matrix, model_title)
You want the second column from the pred_prob array. That turns into the is_iceberg column. Also need to keep track of the ids.
Convert band_1 to its own dataframe
# Expand every test-set band_1 image into one column per pixel.
band_1_test_df = test_df.band_1.apply(np.asarray).apply(pd.Series)
Extract inc_angle from test set and concat with band_1
# Test-set feature matrix for the HH model: pixel columns + incidence angle.
hh_submission = pd.concat((band_1_test_df, test_df.inc_angle), axis=1)
Run predict_proba on array
# Score the test set with the last-fitted forest. The hard predict() call
# from the original cell is dropped — its result was never used.
# NOTE(review): `forest` is whichever model was fitted most recently above —
# verify it was trained on the same HH-pixel feature layout used here.
# NOTE(review): test rows with a missing inc_angle would make predict_proba
# raise on NaN — confirm the test set is complete or impute first.
x = hh_submission.values
pred_prob = forest.predict_proba(x)
Convert pred_prob to dataframe for submission
# predict_proba's second column is P(is_iceberg); pair it with the ids.
submission = pd.DataFrame(pred_prob, columns=['not_iceberg', 'is_iceberg'])
submission = pd.concat((test_df.id, submission.is_iceberg), axis=1)
Check to make sure dataframe is formatted correctly
# Sanity check: columns should be exactly ['id', 'is_iceberg'].
submission.head()
Convert to csv for submission
# Write the submission file without the index column, per Kaggle's format.
submission.to_csv('data/submission_v1.csv', index=False)
submission.shape
submission.head()
# Scratch: how np.concatenate behaves along each axis for 3-D and 2-D inputs.
a = np.array([[[0, 1, 2]]])
b = np.array([[[3, 4, 5]]])
for axis in (2, 1, 0):
    np.concatenate((a, b), axis=axis)
    np.concatenate((a, b), axis=axis).shape
c = np.array([[0, 1, 2]])
d = np.array([[3, 4, 5]])
for axis in (1, 0):
    np.concatenate((c, d), axis=axis)
    np.concatenate((c, d), axis=axis).shape